---
title: 'The Methodological Divide of Sociology<br>R Code Documentation'
output:
  html_notebook:
    code_folding: hide
    theme: readable
    toc: yes
    toc_float:
      collapsed: no
      smooth_scroll: no
  html_document:
    df_print: paged
    toc: yes
---


```{r setup, include=FALSE}
#adjust your working directory here
knitr::opts_knit$set(root.dir = '/home/cs/Dropbox/kasus/PhD/projects/soc_journals')
```


```{r, message=FALSE, warning=FALSE}
# setting seed
set.seed(1337)

# loading libraries and preparing data for analysis
library(jaod)
library(tidyverse)
library(hrbrthemes)
library(quanteda)
library(spacyr) # see https://github.com/quanteda/spacyr for spacyr installation
library(ggrepel)
library(cowplot)
library(stargazer)
```




## Case selection

This document includes analysis for articles published in **Generalist Sociology Journals between 1995 and 2017**. The selection of journals is based upon the following criteria:


* Journals that achieved high ranks in the latest [Social Science Citation Index](https://en.wikipedia.org/wiki/Social_Sciences_Citation_Index) (SSCI).
* Journals that are considered to be generalist Sociology journals without either a theoretical / methodological (e.g. [Sociological Methods and Research](http://journals.sagepub.com/home/smr)) focus, or a domain-specific focus (e.g. [Journal of Marriage and Family](http://onlinelibrary.wiley.com/journal/10.1111/(ISSN)1741-3737)).
* Journal for which abstract texts are available in databases from 1995 to 2017.

In addition, the new generalist and journal [Sociological Science](https://www.sociologicalscience.com/) is included to see how open-access  publications differ in comparison to traditional journals in the field. As there is no *well defined population* of generalist Sociology journals, this case selection is obviously to some extent subjective.

The dataset includes the following journals:

* Annual Review of Sociology
* American Sociological Review
* American journal of Sociology
* European Sociological Review
* Social Forces
* Social Problems
* Sociology
* British Journal of Sociology
* Social Science Research
* Acta Sociologica
* Sociological Quarterly
* Sociological Forum
* Sociological Science 


## Data preparation

Creating the dataset consisted of several steps:

* Extracting data for Sociological Science via the [Directory of Open Access Journals](https://doaj.org/).


```{r message=FALSE, warning=FALSE, results='hide', eval = FALSE}
# extracts data for socsci from the doaj

socsci <- jaod_article_search(query = "issn:2330-6696", pageSize = 100,
                            page = 1)$result %>% bind_rows(
jaod_article_search(query = "issn:2330-6696", pageSize = 100,
                            page = 2)$results)

socsci <- socsci %>% transmute(
  eid = id, 
  year = bibjson.year %>% as.numeric(),
  abstract = bibjson.abstract,
  title = bibjson.title, 
  link = bibjson.link %>% map_chr(1),
  source = bibjson.journal.title %>% str_to_lower(),
  index_keywords = bibjson.keywords %>% 
    map_chr(str_c, collapse = '; '),
  authors = bibjson.author %>% map(~ pull(.x, name)),
  affiliations = bibjson.author %>% map(~ pull(.x, affiliation)),
  doi = bibjson.identifier %>% 
    map_chr(~ filter(.x, type == 'doi') %>% pull(id))
  ) %>% filter(year <= 2017)

socsci_write <- socsci %>%  mutate(authors = 
                   authors %>% 
                   map(str_c, collapse = '; ') %>% flatten_chr(),      
                   affiliations =     affiliations %>% 
                   map(str_c, collapse = '; ') %>%flatten_chr())

write_tsv(socsci_write, 'sociological_science.tsv')
```

* Extracting data for SSCI journals from the [Scopus](https://en.wikipedia.org/wiki/Scopus) database.
* Merging the data and applying some cleaning (e.g. different spellings for journals).

```{r message=FALSE, warning=FALSE, results = 'hide', eval = FALSE}
# reads in scopus data, normalizes names and merges with soc sci articles

df <- list.files('soc_journals_data/') %>% # path for publication files
  map(~ read_csv(str_c('soc_journals_data/', .)) %>% 
                             mutate_all(as.character) ) %>% bind_rows()%>% 
  set_names(snakecase::to_snake_case, sep_in = "\\.") %>%
  mutate(year = as.numeric(year), cites = as.numeric(cited_by),
         cites =  if_else(is.na(cites), 0, cites),
         # detect missing abstracts and authors
         abstract = replace(abstract, abstract == '[No abstract available]', 
                            NA),
         authors = replace(authors, authors == '[No author name available]', 
                            NA),
          source = str_to_lower(source_title), 
       type = str_to_lower(document_type)) %>% 
  select(eid, title, year,  cites, source,  authors,
           abstract,  affiliations, authors_with_affiliations,
         correspondence_address, author_keywords, index_keywords,
         funding_details, funding_text, references,  
          doi, link,  publisher, language_of_original_document, type) %>% 
  distinct(eid, .keep_all = TRUE) %>% 
  # unify journal names
   mutate(source = case_when(
                             source == 'ajs; american journal of sociology' ~
                               'american journal of sociology',
                             source == 'the british journal of sociology' ~
                               'british journal of sociology',
                             TRUE ~ source))

# combine and store data
socsci <- read_tsv('../sociological_science.tsv') %>% 
  mutate(type = 'article')

df <- bind_rows(df, socsci)
write_tsv(df, '../ssci_soc_journals_df.tsv')
```

After these steps, an additional round of preparation and cleaning follows:

* Only articles and articles in press are considered (no book reviews, errata, etc.).
* Only articles with abstracts are considered (true for most publications in this dataset).
* Noise within abstracts is removed (e.g. keywords and copyright entries).

```{r, message=FALSE, warning=FALSE, results = 'hide', eval = FALSE}
df <- read_tsv('ssci_soc_journals_df.tsv') %>% 
  # only keep articles
   filter(type %in% c('article', 'article in press')) %>% 
   filter(year <= 2017 & !is.na(abstract)) %>% # remove missing abstracts
   filter(source %in%  # remove out-of-sample observations
              c('american journal of sociology', 
                 'american sociological review', 
                'social forces', 'european sociological review', 
                'annual review of sociology', 'sociology', 
                'social problems', 'social science research', 
                'sociological science', 'british journal of sociology',
                'acta sociologica',
                'sociological quarterly', 'sociological forum')) %>% 
  distinct(abstract, .keep_all = TRUE) %>% 
  mutate(
 source = as.factor(source),
 # detect and remove keywords and copyrights in abstracts
 abstract = case_when(str_detect(abstract, 'Keywords:') ~ 
                        str_split(abstract, 'Keywords:') %>% map_chr(1),
                      TRUE ~ abstract),
 abstract = str_split(abstract, '©') %>% map_chr(1))

write_tsv(df, 'ssci_soc_journals_df_final.tsv')
```


## Data analysis

```{r message=FALSE, warning=FALSE, results = 'hide'}
# reloading the data frame
df <- read_tsv('ssci_soc_journals_df_final.tsv')
```


The dataset contains `r nrow(df)` publications. 




As the sample includes the most recognized journals in the field from 1995 onwards, some of the papers reached a very high number of citations. The following table shows the  most-cited papers in the dataset.

```{r}
df %>% 
    arrange(desc(cites)) %>% select(cites, title, authors, source, year) %>%
  head(100) %>%     DT::datatable(  options = list(
      lengthMenu = c(5, 10, 20)))
```


Moving away from single publications, the next plot shows the number of articles published per journal.

```{r fig.height=6, fig.width=9}
df %>% count(source) %>% 
  ggplot(aes(x = reorder(source, n) , y = n)) + geom_col() + coord_flip() +
  theme_ipsum(axis_title_size = 13, axis = 'Xx', grid = 'Xx')  + 
  scale_y_comma(breaks = scales::pretty_breaks(n = 6)) +
  labs(y = 'No. of Articles', x = 'Journal')

 # ggsave('output/socsci_articles_per_journal.png',
 #        width =  9, height = 6, dpi = 300)
```

It can be seen that Social Science Research and Social Forces published the most articles.  The number for Sociological Science is low because the journal first started publishing in 2014, while the other journals have been around for more than 20 years. As for the Annual Review of Sociology, the small share of publications is also unsurprising because reviews are not included in the dataset and the publication output of annual journals is naturally low.

## Text as data

The main part of this document deals with analysis on textual data. Because texts are messy, this also requires intensive preprocessing.

In a first step, all abstracts are parsed with the [spaCy](https://spacy.io/) library. While easier methods are available in several R packages, spaCy enables the [lemmatization](https://en.wikipedia.org/wiki/Lemmatisation) of words. Lemmatizaion is useful for combining words with similar semantic meaning (e.g. *walk, walked, walking*) to a single feature (e.g. *walk*).

```{r message=FALSE, warning=FALSE, results = 'hide', eval = FALSE}
# running the spacy parser and saving results
# spacy_install()
spacy_initialize(model = 'en' ,refresh_settings = TRUE)
parsed <- spacy_parse(df$abstract)
spacy_finalize()
save.image('soc_journals_spacy.RData')
```

After that, additional preprocessing steps are taken:

* converting all abstracts to the [bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model) format.
* removing stopwords (e.g. *the, and*) and empty strings from the lemma tokens.
* unifying tokens for different UK / US spellings.
* detecting and connecting collocations (e.g. *united states*).
* converting tokens into a document-feature matrix.
* removing terms which don't occur in at least 50 abstracts (important for robustness of results).


```{r message=FALSE, warning=FALSE, results = 'hide'}
load('soc_journals_spacy.RData') # loads lemma
 
 soc_tokens <- as.tokens(parsed, use_lemma = TRUE) %>% 
   tokens(remove_punct = TRUE, remove_numbers = TRUE) %>% 
     tokens_tolower() %>% 
  tokens_remove(c(stopwords('english'), 'copyright',
                  'author', 'authors', 'article','paper',
                  'review', 'reviews','research', '-pron-', "'s",
                  '’s'),  min_nchar = 1L,  padding = TRUE) 
 
UK <- c('labour', 'centre', 'colour', 'flavour', 'humour', 'neighbour',
        'neighbourhood', 'defence', 'offence', 'analyse', 'behaviour',
        'organisation', 'organisational')

US <- c('labor', 'center', 'color', 'flavor', 'humor', 'neighbor',
        'neighborhood', 'defense', 'offense', 'analyze', 'behavior',
        'organization', 'organizational')

soc_tokens <- tokens_replace(soc_tokens, UK, US)

coll <- textstat_collocations(soc_tokens, min_count = 50)
soc_tokens <- tokens_compound(soc_tokens, coll, join = FALSE)
docvars(soc_tokens) <- df %>% 
  select(abstract, source, year) %>% rename(text = 'abstract')

bigrams <- c('per_cent')
unigrams <- c('percent')
soc_tokens <- tokens_replace(soc_tokens, bigrams, unigrams)

dfm_soc <- soc_tokens %>% dfm() %>% dfm_select(min_nchar = 2L) %>% 
  dfm_trim(min_docfreq = 50) %>%  # minimum 50 documents (removes rare terms)
   dfm_trim(max_docfreq = 0.25,
            docfreq_type = 'prop') # maximum in 25 percent of documents (stop words)

soc_grouped <- dfm_group(dfm_soc, groups = 'source') # grouped dfm for keyness
```

The final document-feature matrix contains `r ndoc(dfm_soc)` observations (abstracts) and `r nfeat(dfm_soc)` different features (terms) for a total of `r sum(ntoken(dfm_soc))` tokens. A wordcloud illustrates the most commonly used terms in all abstracts, where more frequent terms are larger in size.

```{r fig.height=8, fig.width=8}
#png('output/socsci_wordcloud.png', res = 300,
 #   width =  7, height = 7, units = 'in')
textplot_wordcloud(dfm_soc, max_words = 100, color = 'black',
                   min_size = 0.1, max_size = 3)
#dev.off()
```

The output shows that the generalist journals cover a broad range of topics (e.g. *political, woman, work, cultural, ..*). Moreover, many of the most common terms could be labeled as "science slang" (e.g *analysis, theory, increase, mode*).


## Key terms for each journal

The following table shows [key](https://www.rdocumentation.org/packages/quanteda/versions/0.99.22/topics/keyness) terms for each journal (in comparison to all other journals) based upon chi-squared tests with Yates correction. These terms are appear more frequently in abstracts from the corresponding journal than what would be expected by chance. While this approach is sensitive to very rare terms in the entire corpus, the current feature selection from above (only keeping terms appearing in at least 50 abstracts) adds stability. For each term, the frequency within abstracts from the journals is included in parentheses.


```{r message=FALSE, warning=FALSE}
all_keys <- docnames(soc_grouped) %>% map(function(x) {
  textstat_keyness(soc_grouped, target = x, measure = 'chi2') %>% 
    mutate(target = x)}
) %>% bind_rows()


pro_keys <- all_keys  %>% select(feature,chi2, target, n_target) %>% 
                                  group_by(target) %>% 
arrange(desc(chi2)) %>% slice(1:10) %>%  
summarise(terms = str_c(feature,' (', n_target, '), ', collapse = '') %>% 
            str_sub(1L, -3L),
          target_counts = list(n_target)) %>% 
rename(journal = target, key_terms = terms)

pro_keys %>% select(journal, key_terms) %>% 
  knitr::kable(caption = 'Key terms for each journal (counts in parentheses).')

```

```{r, results = 'hide', eval = FALSE}
# stargazer table
stargazer(pro_keys %>% select(journal, key_terms),
          summary = FALSE, rownames = FALSE)
```

## Unidimensional scaling

The main focus of this work is to answer the question whether the quantitative-qualitative divide as potentially the most polarizing dimension for Sociology publications in these journals. Moreover, we want to investigate this polarization across journals and over time.

There are several methods for automated text analysis which could be used to shed light on this (e.g. Latent Semantic Analysis, Topic Modeling). In this case however, we apply a method which has originally been developed for Political Science applications, the [Wordfish](http://onlinelibrary.wiley.com/doi/10.1111/j.1540-5907.2008.00338.x/full) scaling model. Wordfish is commonly used by Political Scientists to position actors along a single ideological dimension. This could be for instance the economic position of parties, captured by the terms they use in party manifestos. The underlying statistical model is count based and assumes a Poisson distribution, with one parameter $\lambda$, of words. The functional form is as follows:

$$ wordcount_{ij} \sim Poisson(\lambda_{ij})$$
$$\lambda_{ij} = exp(\alpha_{i} + \psi_{j} + \beta_{j} * \theta_{i}) $$
Applied to the Sociology abstracts, this models the number of times abstract $i$ inludes term $j$.

* Alpha $\alpha$ is a document-level-fixed effect, controlling for some abstracts including more terms than others.
* Psi $\psi$ is word-fixed-effect, controlling for some terms being used more frequently than others.
* Beta $\beta$ is the estimate weight for each term used to position the documents on the one-dimensional scale.
* Theta $\theta$ is the estimate for each document (abstract) on the one-dimensional scale.

```{r, eval = FALSE}
# this code chunk will not be evaluated
wf_soc <- textmodel_wordfish(dfm_soc, dir = c(2, 1),
                             sparse = TRUE) # see github discussions
wf_terms <- tibble(feature = wf_soc$features, beta = wf_soc$beta, 
                 psi = wf_soc$psi)
wf_groups <- tibble(groups = wf_soc$docs, theta = wf_soc$theta,
                    theta_se = wf_soc$se.theta)

save(wf_soc, wf_terms,wf_groups,
     file = 'soc_journals_wordfish.RData')
save.image(compress = 'bzip2')
```

A priori, one can only assume what the single dimension modeled by wordfish will represent and after fitting the model, validation is necessary to understand what the estimated parameters actually capture. For this work, the a priori assumption was that the one dimensional scaling reveals a divide between qualitative and quantitative publications.

```{r message=FALSE, warning=FALSE, results='hide'}
# Reload libraries and data
library(tidyverse)
library(hrbrthemes)
library(quanteda)
library(ggrepel)
library(cowplot)
library(stargazer)
library(scales)
library(splines)
library(ggeffects)
library(texreg)

load('soc_journals_spacy.RData') # loads lemma
load('soc_journals_wordfish.RData') # loads scaling model
```

### Understanding the latent dimension


Parameter distributions:

```{r fig.height=6, fig.width=6}
p_psi <- wf_terms %>% ggplot(aes(x = psi)) +
  geom_density() + theme_ipsum(axis_title_size  = 12) + 
  labs(x = 'Term fixed effects (Psi)', y = 'Density')
p_beta <- wf_terms %>% ggplot(aes(x = beta)) +
  geom_density()  + theme_ipsum(axis_title_size  = 12) + 
  labs(x = 'Term weights (Beta)', y = 'Density')
p_theta <- wf_groups %>% ggplot(aes(x = theta)) +
  geom_density()  + theme_ipsum(axis_title_size  = 12) + 
  labs(x = '1d scaling positions (Theta)', y = 'Density')



p_grid <- plot_grid(p_psi, p_beta, p_theta,
           labels = c(NULL),
            label_size  = 12, label_fontface = 'plain',
         ncol = 1)
p_grid
# 
# ggsave('output/soc_journals_wf_params.png', p_grid, 
#        dpi = 300, height = 7, width = 7)
```


Term weights and fixed effects:

```{r}
wf_terms %>% arrange(beta) %>% 
  DT::datatable()
```


To understand which latent dimension is captured by the model, terms and their corresponding $\beta$ and $\phi$ values are examined. Visualizing and labeling all `r nfeat(dfm_soc)` terms is not feasible, therefore the following plot shows estimates for every term, but only labels for several of them.


```{r fig.height=7, fig.width=9}

# select terms to show in plot
to_show <- c('qualitative', 'quantitative', 'n=', 'ols', 'probit',
            'kindergarten',  'theorise', 'foucault', 'analysis',
            'organization', 'interview', 'recognise', 'bourdieu',
            'theory',  'hypothesis', 'causal_effect', 'change',
            'regression', 'discourse', 	'subjectivity', 'text',
            'knowledge', 'culture', 'radical', 'hispanic',
             'fix_effect', 'hegemony', 'unobserved', 'parent', 'man',
            'logit', 	 'nativity', 'full_time', 'sociology', 'politic',
            'social_theory', 'epistemology', 'marital', 'paternal',
            'migrant', 'cosmopolitanism', 'nonmarital', 'radical', 'protest',
            'black_woman', 'full_time', 'luhmann', 'social_movement',
            'feminist', 'mother', 'labor', 'work', 'public_policy',
            'survey', 'panel_study', 'relationship', 'matching',
            'family_structure', 'bourdieu', 'object', 'discursive',
            'discourse', 'intellectual', 'statistically_significant',
            'neoliberal', 'average', 'distribution', 'child', 'gender_role',
            'predict', 'income_dynamics', 'framing',  'multilevel',
            'ethnographic', 'logistic', 'civil_society', 'weber',
            'longitudinal_survey', 'ethnic', 'logistic', 'divorce', 'black',
            'cohabit', 'cohabitation', 'ground', 'woman', 'life_course')

p_terms <- wf_terms %>% 

  ggplot(aes(x = beta, y = psi)) +  
  geom_point(aes(color = beta)) + 
  scale_color_gradient2(low = '#fc8d62', high = '#66c2a5', mid = 'grey90') + 
  scale_y_continuous(expand = c(0.08,0.0),  breaks= pretty_breaks(n = 5),
                     limits = c(-10, -1)) + 
  scale_x_continuous(expand = c(0.08,0.00),
                     breaks= pretty_breaks(n = 6)) + 
  geom_text_repel(data = wf_terms %>% filter(feature %in% to_show),
                                        aes(label = feature),
                  force = 1, direction = 'both',
                  size = 3,
                  arrow = arrow(length = unit(0.01, "npc")),
                  segment.colour="grey10") + 
  guides('color' = "none")+ theme_ipsum(axis_title_size  = 12,
                                        grid="XY") +
  labs(title = 'Generalist Sociology Journals: a Qualitative-Quantitative Divide?',
       subtitle = 'Unidimensional scaling of abstract texts (1995-2017)',
       x = 'Term Weight (higher = more quantitative methodology)', y = 'Term Fixed Effect (higher = more frequent terms)',
       caption = 'Weights position terms on the 1d scale. Terms with high fixed effects occur more frequently.')
p_terms

 # ggsave('output/socsci_wordfish.png',
 #        plot = p_terms + labs(title = NULL, subtitle = NULL, caption = NULL),
 #        width =  9, height = 7, dpi = 300)
```


The next plot shows the distribution of the abstract level scaling positions (theta):

```{r message=FALSE, warning=FALSE}
mean_theta = mean(wf_groups$theta)
wf_groups %>% ggplot(aes(theta, fill = theta)) +
  geom_histogram() +
  theme_ipsum(axis_title_size  = 12) +
  geom_vline(xintercept=mean_theta, linetype=2)

# ggsave('output/socsci_theta_hist.png',
#        width =  8, height = 6, dpi = 300)
```


For further validation the following tables include abstracts with the most extrem positions ($\theta$) on the 1d scale. While these do not represent "average publications", inspecting them is very useful for making sense of the scaling model.

First, publications with low theta values, e.g. including rather orange than green terms from the plot above: 

```{r}
df %>% bind_cols(wf_groups) %>% arrange(theta) %>% 
  distinct(title, .keep_all = TRUE) %>% head(50) %>% 
  mutate(authors =  authors %>% map_chr( ~ str_c(.x, collapse = ', '))) %>% 
  select(theta, title, source, authors, year, abstract) %>% 
  mutate(theta = round(theta, 2)) %>% 
    DT::datatable(  options = list(
      lengthMenu = c(1, 2, 5)))
```

In comparison, these are publications from the other extreme of the spectrum (high $\theta$ values):


```{r}
df %>% bind_cols(wf_groups) %>% arrange(desc(theta)) %>% 
  distinct(title, .keep_all = TRUE) %>% head(50) %>% 
    mutate(authors =  authors %>% map_chr( ~ str_c(.x, collapse = ', '))) %>% 
    select(theta, title, source, authors, year, abstract) %>% 
    mutate(theta = round(theta, 2)) %>% 
    DT::datatable(  options = list(
      lengthMenu = c(1, 2, 5)))
```

## Journal and Time effects


### Regression framework

To analyze journal and time effects, three OLS regressions are used for the scaling estimates of all abstracts:

  - Model 1 with linear effect for year
  - Model 2 with polynomial spline for year (3 degrees of freedom)
  - Model 3 with polynomial spline for year (5 degrees of freedom)

#### Regression table

```{r message=FALSE, warning=FALSE, results='asis'}

df_reg <- df %>% bind_cols(wf_groups) %>% 
  mutate(year = as.numeric(year),
         source = as.factor(source) %>% 
           relevel(ref = 'american journal of sociology'))


reg <- lm(theta ~ source + bs(year), data = df_reg)
reg2 <- lm(theta ~ source +  bs(year, degree = 5), data = df_reg)
reg_lin <- lm(theta ~ source + year, data = df_reg)
names(reg$coefficients) <- str_replace_all(names(reg$coefficients),
                                           'source', '')
names(reg_lin$coefficients) <- str_replace_all(names(reg_lin$coefficients), 
                                           'source', '')
names(reg2$coefficients) <- str_replace_all(names(reg2$coefficients), 
                                           'source', '')

htmlreg(list(reg_lin, reg, reg2), stars = c(0.05, 0.01, 0.001),
          include.aic = TRUE, include.adjrs = TRUE,
          single.row = TRUE, include.rmse = TRUE,
       include.rsquared = TRUE) 
```


### Predictions

As the more complicated models with splines do not substantially improve model fit, the following predictions are based upon the linear model:

```{r fig.height=8, fig.width=6}
 p_year_reg <-  ggpredict(reg_lin, 'year') %>%  
   ggplot(aes(x = x, y = predicted)) + geom_point(size = 2) +
  geom_line() + 
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 5), expand = c(0,0.05)) +
  scale_x_continuous( breaks = c(seq(1995, 2017, 1)), expand = c(0,0.05)) +
  theme_ipsum(axis_title_size  = 12, grid = 'XY') +
   theme(axis.text.x = element_text(angle = 90)) +
   labs( x = 'Year', y = 'Scaling Position\n(higher = more quantitative)')

# ggsave('output/socsci_wordfish_year_reg.png',
#         plot = p_year_reg,
#        width =  6, height = 9, dpi = 300)
 
 
p_source_reg <-  ggpredict(reg_lin, 'source') %>% mutate(x = get_x_labels(.)) %>% 
 ggplot(aes(x = reorder(x, predicted), y = predicted)) + 
  geom_point(size = 2) + geom_errorbar(aes(ymin = conf.low, ymax = conf.high),
                                         width = 0.25) + 
  coord_flip() + theme_ipsum(axis_title_size  = 12, grid = 'Xx',
                             axis = 'X') +
  scale_y_continuous(breaks = scales::pretty_breaks(8)) + 
     labs(title = 'Generalist Sociology Journals: a Qualitative-Quantitative Divide?',
       subtitle = 'Aggregated unidimensional scaling of abstract texts (1995-2017)',
       x = 'Journal', y = 'Scaling Position (higher = more quantitative)')


# ggsave('output/socsci_wordfish_source_reg.png',
#         plot = p_source_reg,
#        width =  6, height = 9, dpi = 300)
 

p_grid_reg <- plot_grid(p_source_reg + 
                          labs(title = NULL, subtitle = NULL, caption = NULL), 
          p_year_reg  + labs(title = NULL, subtitle = NULL, caption = NULL), 
           labels = c('(a)', '(b)'),
            label_size  = 12, label_fontface = 'plain',
         nrow = 2)
p_grid_reg

# ggsave('output/socsci_wordfish_combined_reg.png',
#         plot = p_grid_reg,
#        width =  8, height = 9, dpi = 300)
```

## Appendix



### Number of authors and methodology

We examine whether, as argued in our paper, the use of quantitative methodology correlates with a higher number of authors per paper.

```{r message=FALSE, warning=FALSE, results = 'hide'}
df_auth <- read_tsv('ssci_soc_journals_df_final.tsv') %>% 
  mutate(nr_authors = str_count(authors, ',') + 1 ) %>% 
  # count commata which separate authors
  bind_cols(wf_groups) %>% 
  select(authors, nr_authors, theta, title) %>% 
  drop_na() # drop 2 cases with mission author strings

df_auth %>% count(nr_authors, sort = TRUE)
```

We can now visualize the number of authors together with our scaling estimate for each publication in a scatter plot. Points are jittered slightly,a linear trend is shown with a blue line and publications with more than 7 authors are excluded:

```{r fig.height=6, fig.width=9, message=FALSE, warning=FALSE}
df_auth %>% ggplot(aes(x = nr_authors, y = theta)) + geom_jitter(alpha = 0.5) +
  geom_smooth(method = 'lm') + 
  theme_ipsum(axis_title_size  = 12, grid = 'XY', axis = 'XY') +
 
   scale_x_continuous(breaks = scales::pretty_breaks(9), limits = c(1,7)) +
  labs(x = 'Number of authors',
   y = 'Scaling position (higher = more quantitative)',
  caption = 'Publications with > 7 authors excluded for visualisation purposes')


# ggsave('output/socsci_nrauth.png',  width =  9, height = 6, dpi = 300)
```



The plot indicates that more quantitative methodology is indeed associated with a higher number of coauthors. This can also be quantified with a pearson correlation while also including the few publications with more than seven authors:

```{r message=FALSE, warning=FALSE}
options(scipen = 999)
library(psych)

auth_cor <- df_auth %>% select(theta, nr_authors) %>% corr.test()
print(corr.p(auth_cor$r, n = 8737), short = FALSE)
```

At last, we compare publications with one autor to coauthored papers and visualize the results with boxplots. The plot again suggests that for our dataset publications relying on quantitative methodology more often are written by multiple authors.

```{r fig.height=6, fig.width=9}
df_auth %>% 
  mutate(coauthored = ifelse(nr_authors > 1, 'Coauthored', 'Single author')) %>% 
  ggplot(aes(y = theta, x = coauthored)) +
  geom_boxplot() +
  theme_ipsum(axis_title_size  = 12, grid = 'XY', axis = 'XY') +
  labs(x = 'Authors',
   y = 'Scaling position (higher = more quantitative)')

ggsave('output/socsci_catauth.png',  width =  9, height = 6, dpi = 300)
```


### Country effects

To analyze whether the methodology of articles is also influenced by the nationality of authors, we utilize the affiliations for each publication.

#### Data preparation

First, we pass all affiliations to the Google Geocoding API. Please note that the corresponding raw data is not included in the replication material.

```{r message=FALSE, warning=FALSE, eval = FALSE}
#devtools::install_github("dkahle/ggmap", ref = "tidyup")
library(ggmap)
df <- read_tsv('ssci_soc_journals_df_final.zip')
register_google() # insert your api Key here

# pull and split affiliations
toparse <- df %>% filter(!is.na(affiliations)) %>%
  pull(affiliations) %>% str_split(';') %>%
  unlist() %>%
  str_trim() %>%
  unique()

# create list for results
df_list <- vector("list", length(toparse))

# iterate over all affilations and retrieve geocodes
for (i in 1:length(toparse)) {
  #cat("\014")
  #cat(toparse[i])
  
  Sys.sleep(0.1)
  result <-
    tryCatch(
      geocode(toparse[i], output = "more", source = "google"),
      warning = function(w)
        data.frame(
          affiliation = toparse[i],
          lon = NA,
          lat = NA,
          address = NA
        )
    )
  result$affiliation <- toparse[i]
  df_list[[i]] <- result
}

final <- bind_rows(df_list)
save.image('geocoding.RData') # store everything in an R environment
```

We then manually inspected the small number of cases where the API did not properly geocode the affiliations, corrected these cases and stored everything in a new dataset. This dataset includes the corresponding country for each affiliation.

```{r message=FALSE, warning=FALSE}

geo <- read_tsv('socjournals_geocoding.tsv')

geo %>% count(country, sort = TRUE)

counts <- geo %>% group_by(country) %>% 
  summarise(obs = n())
```


We merge the data with our main dataset and reshape it such that each row represents one affiliation. Some observations get lost in this process because affiliation data was not available for every article.


```{r message=FALSE, warning=FALSE}
# reloading the  main data frame
main <- read_tsv('ssci_soc_journals_df_final.tsv')
main <- main %>% mutate(affiliation = str_split(affiliations, ';'),
                                              affiliation = affiliation %>% 
                     map(str_trim)) %>% unnest(affiliation)

comb <- main %>% left_join(geo %>% select(affiliation,
                                        country, lon, lat))
comb %>% sample_n(20) %>% select(country, affiliation)
  
```

Now we create a country-based indicator:

```{r}
agg <- comb %>% 
  mutate(region = case_when(is.na(country) ~ 'No affiliation data',
                            country == 'United States' ~ country,
                            country == 'United Kingdom' ~ country,
                            TRUE ~ 'Other Countries'))
agg %>% count(region)
```

And aggregate for each publication:

```{r}
agg <- agg %>% group_by(eid) %>% 
  summarise(country_us = sum(region == 'United States'),
            country_uk = sum(region == 'United Kingdom'),
            country_other = sum(region == 'Other Countries'),
            country_none = sum(region == 'No affiliation data'),
            affiliations = first(affiliations),
            affil_count = ifelse(is.na(affiliations), NA, n()))

agg %>% count(affil_count, sort = TRUE) # number of authors per publication

agg %>% write_tsv('soc_journals_countrycounts.tsv')
```

#### Affiliation map

We can use the geocoded data to visualize affiliations for authors in our dataset by country.

```{r message=FALSE, warning=FALSE, results = 'hide'}
library(sf)
library(hrbrthemes)
library(rworldmap)
library(rworldxtra)
library(ggrepel)
#library(extrafont)
#loadfonts(device = "win")
world <- getMap(resolution = "high")
world <- st_as_sf(world)
```


The world data works with slightly different names of some countries, which is why they need to be adjusted first:

```{r message=FALSE, warning=FALSE}
adj_counts <- comb %>% count(country) %>% mutate(
  obs = n,
  ADMIN.1 = case_when(country == 'Czechia' ~ 'Czech Republic',
                      country == 'Hong Kong' ~ 'Hong Kong S.A.R.',
                      country == 'Macau' ~ 'Macau S.A.R' ,
                      country == 'Palestine' ~ 'Israel',
                      country == 'Serbia' ~ 'Republic of Serbia',
                      country == 'United States' ~ 'United States of America',
                      TRUE ~ country)
)

affil <- world %>% left_join(adj_counts)
affil <- affil %>% 
  mutate(obs = ifelse(is.na(obs), 0, obs))
```

Now we create a categorical variable for affiliation counts per country:

```{r}
affil <- affil %>% 
  mutate(obs_cat = case_when(
    obs == 0 ~ '0',
    obs > 0 & obs <= 50 ~ '1 - 50',
    obs > 51 & obs <= 100 ~ '51 - 100',
    obs > 100 & obs <= 250 ~ '101 - 250',
    obs > 250 & obs <= 1000 ~ '251 - 1000',
    obs > 1000  ~  '> 1000'
  ) %>% as_factor() %>% 
    fct_relevel('0', '1 - 50', '51 - 100', '101 - 250', '251 - 1000', '> 1000' ))

```

Finally, we visualize this and highlight the five countries with the most affiliations:

```{r fig.height=5, fig.width=9, message=FALSE, warning=FALSE}
affil <- affil %>% 
  mutate(NAME = as.character(NAME), 
         name_count = str_c(NAME, ' (n = ', as.character(obs), ')'),
         toshow = ifelse(obs > 300, 
                         name_count, NA)) 
map_plot <- ggplot(affil) +
    geom_sf(aes(fill = obs_cat)) +
    scale_fill_viridis_d() + 
  coord_sf( ylim = c(-40, 100), expand = FALSE) +
  geom_label_repel(aes(LON, LAT, label = toshow), size = 3.5, 
                   segment.size = 0.75, hjust = "right", 
        color = 'black', nudge_x = 70) +
  theme_ipsum(axis_title_size  = 12, grid = 'Xx',
                             axis = 'X') + 
  theme(legend.position = c(0.08, 0.31),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black")) +
  labs(fill = 'Affiliations', x = 'Longitude', y = 'Latitude') 
   
map_plot

# ggsave('output/socsci_countrymap.png',
#         plot = map_plot,
#        width =  9, height = 5, dpi = 300)
```


#### Country effect regressions

We load the country data again and merge it with our regression dataframe from above. Now, we can compute relative shares for each country. As an example, `rel_uk` would indicate the proportion of authors for each publication whose affiliation is based in the United Kingdom.

```{r message=FALSE, warning=FALSE}
geo <- read_tsv('soc_journals_countrycounts.tsv') %>%
  select(-affiliations) %>%
  left_join(df_reg, by = 'eid') %>%
  filter(country_none == 0) %>%
  mutate(rel_uk = (country_uk / affil_count),
         rel_us = (country_us / affil_count),
         rel_other = (country_other / affil_count))
```

With this data, we fit linear models to examine the country effects while controlling for journal and time effects:

```{r message=FALSE, warning=FALSE, results='asis'}
reg_g <-  lm(theta ~ source + year, data = geo)
names(reg_g$coefficients) <- str_replace_all(names(reg_g$coefficients),
                                           'source', '')
reg_g_2 <- lm(theta ~ source + rel_uk + rel_us + year, data = geo)
# rel_other is ommited because of collinearity
names(reg_g_2$coefficients) <- str_replace_all(names(reg_g_2$coefficients), 
                                           'source', '')


htmlreg(list(reg_g, reg_g_2), stars = c(0.05, 0.01, 0.001),
          include.aic = TRUE, include.adjrs = TRUE,
          single.row = TRUE, include.rmse = TRUE,
       include.rsquared = TRUE)
```

Overall, the country data barely improve predictions of our scaling estimates. The next graph shows that there is a minor, negative effect for UK affiliations (meaning a preference for qualitative work) and no effect for US affiliations.

```{r fig.height=8, fig.width=8}
 p_cntry_reg_uk <-  ggpredict(reg_g_2, 'rel_uk') %>%  
   ggplot(aes(x = x, y = predicted)) + geom_line(size = 2) +
  geom_line() + 
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
  scale_y_continuous(limits = c(-0.45, -0.10), 
                     breaks = scales::pretty_breaks(n = 6), 
                     expand = c(0,0.05)) +
  scale_x_continuous(labels = scales::percent, expand = c(0,0.05)) +
  theme_ipsum(axis_title_size  = 12, grid = 'XY') +

   labs( x = 'Share of authors with United Kingdom affiliations',
         y = 'Scaling Position\n(higher = more quantitative)')


 p_cntry_reg_us <-  ggpredict(reg_g_2, 'rel_us') %>%  
   ggplot(aes(x = x, y = predicted)) + geom_line(size = 2) +
  geom_line() + 
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2) +
  scale_y_continuous(limits = c(-0.45, -0.10),
                     breaks = scales::pretty_breaks(n = 6), 
                     expand = c(0,0.05)) +
 scale_x_continuous(labels = scales::percent, expand = c(0,0.05)) +
  theme_ipsum(axis_title_size  = 12, grid = 'XY') +

   labs( x = 'Share of authors with United States affiliations', 
         y = 'Scaling Position\n(higher = more quantitative)')


p_grid_reg_g <- plot_grid(p_cntry_reg_uk + 
                            labs(title = NULL, subtitle = NULL, caption = NULL), 
          p_cntry_reg_us  + labs(title = NULL, subtitle = NULL, caption = NULL), 
           labels = c('(a)', '(b)'),
            label_size  = 12, label_fontface = 'plain',
         nrow = 2)
p_grid_reg_g

# ggsave('output/socsci_wordfish_geo_reg.png',
#         plot = p_grid_reg_g,
#        width =  8, height = 8, dpi = 300)
```


### Correspondence Analysis 

As another robustness check for our results, we used another dimensionality reduction technique, correspendence analysis, to scale the articles for two dimensions instead of only one. The results are very similar.



```{r}
soc_ca <- textmodel_ca(dfm_soc, nd = 2)
ca <- coef(soc_ca, doc_dim = c(1,2), feat_dim = c(1,2))

ca_terms <- ca$coef_feature %>% as_tibble() %>% 
  mutate(term =  rownames(ca$coef_feature),
         Dim1 = Dim1 * -1) # adjust order similar to that of the wordfish model

ca_docs <- ca$coef_document %>% as_tibble() %>%
  mutate(doc =  rownames(ca$coef_document),
         Dim1 = Dim1 * -1) %>% # adjust order similar to that of the wordfish model
  bind_cols(docvars(dfm_soc))
ca_docs$title <- df$title

```

#### Term level dimensions

```{r fig.height=7, fig.width=9}
ca_term_p <- ca_terms %>% 
  ggplot(aes(x = Dim1, y = Dim2)) +  
  geom_point(aes(color = Dim1)) + 
  scale_color_gradient2(low = '#fc8d62', high = '#66c2a5') + 
  scale_y_continuous(breaks= pretty_breaks(n = 5)) + 
  scale_x_continuous(breaks= pretty_breaks(n = 6)) + 
  geom_text_repel(data = ca_terms %>% filter(term %in% to_show),
                                        aes(label = term),
                  force = 1, direction = 'both',
                  size = 2.5,
                  arrow = arrow(length = unit(0.01, "npc")),
                  segment.colour="grey10")  +
  guides('color' = "none") + theme_ipsum(axis_title_size  = 12,
                                        grid="XY") +
  labs(x = 'Correspondence Analysis -  Dimension 1 (higher = more quantitative methodology)',
       y = 'Correspondence Analysis - Dimension 2')

ca_term_p

 # ggsave('output/socsci_ca_terms.png',
 #        plot = ca_term_p,
 #        width =  9, height = 7, dpi = 300)
```


#### Document level dimensions (two exemplary journals)

```{r fig.height=6, fig.width=8}
ca_doc_p <- ca_docs %>% 
  filter(source %in% c('social science research',
                                 'sociology')) %>% 
  ggplot(aes(x = Dim1, y = Dim2, color = source)) +
  geom_point(alpha = 0.5) + 
  geom_hline(yintercept = 0, linetype = 2) + 
  geom_vline(xintercept = 0, linetype = 2) +
  theme_ipsum(axis_title_size  = 12,
                                        grid="XY") +
    labs(x = 'Correspondence Analysis -  Dimension 1 (higher = more quantitative methodology)',
       y = 'Correspondence Analysis - Dimension 2')

ca_doc_p

# ggsave('output/socsci_ca_docs.png',
#       plot = ca_doc_p,
#       width =  9, height = 7, dpi = 300)
```


## R Environment

The following R package versions were used to for this document:

```{r message=FALSE, warning=FALSE}
library(jaod)
library(tidyverse)
library(hrbrthemes)
library(quanteda)
library(spacyr) 
library(ggrepel)
library(cowplot)
library(ggmap)
library(stargazer)
library(scales)
library(splines)
library(ggeffects)
library(texreg)
library(sf)
library(rworldmap)
library(rworldxtra)
library(extrafont)
library(psych)

sessionInfo()
```


